library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(patchwork)
library("leaps")
library(faraway)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:faraway':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(broom)
# Load the county-level cancer registry data, clean the column names
# (e.g. avgDeathsPerYear -> avg_deaths_per_year), move the outcome
# target_death_rate to the first column and split `geography` into
# separate `county` and `state` columns.
# The separator also swallows any whitespace that follows the comma, so that
# `state` does not keep a leading blank (presumably the raw values look like
# "County, State" -- verify against the CSV).
cancer_reg <- read_csv("./data/Cancer_Registry.csv") %>%
  janitor::clean_names() %>%
  dplyr::select(target_death_rate, everything()) %>%
  separate(geography, into = c("county", "state"), sep = ",\\s*")
## Parsed with column specification:
## cols(
## .default = col_double(),
## avgDeathsPerYear = col_integer(),
## medIncome = col_integer(),
## popEst2015 = col_integer(),
## binnedInc = col_character(),
## Geography = col_character()
## )
## See spec(...) for full column specifications.
There are in total 35 variables and 3047 observations in the dataset.
Our outcome of interest is target_death_rate.
# Missing data ----
# pct_some_col18_24 has 2285 NAs, pct_private_coverage_alone has 609 NAs and
# pct_employed16_over has 152 NAs.
# Count the NAs in every column. Iterating over the whole data frame (rather
# than a hard-coded 1:34 index) makes sure the last column is checked too.
missing_value <- vapply(cancer_reg, function(x) sum(is.na(x)), integer(1))
# Proportion of missing values per column
percentage_missing <- missing_value / nrow(cancer_reg)
percentage_missing %>% data.frame()
## .
## target_death_rate 0.00000000
## avg_ann_count 0.00000000
## avg_deaths_per_year 0.00000000
## incidence_rate 0.00000000
## med_income 0.00000000
## pop_est2015 0.00000000
## poverty_percent 0.00000000
## study_per_cap 0.00000000
## binned_inc 0.00000000
## median_age 0.00000000
## median_age_male 0.00000000
## median_age_female 0.00000000
## county 0.00000000
## state 0.00000000
## avg_household_size 0.00000000
## percent_married 0.00000000
## pct_no_hs18_24 0.00000000
## pct_hs18_24 0.00000000
## pct_some_col18_24 0.74991795
## pct_bach_deg18_24 0.00000000
## pct_hs25_over 0.00000000
## pct_bach_deg25_over 0.00000000
## pct_employed16_over 0.04988513
## pct_unemployed16_over 0.00000000
## pct_private_coverage 0.00000000
## pct_private_coverage_alone 0.19986872
## pct_emp_priv_coverage 0.00000000
## pct_public_coverage 0.00000000
## pct_public_coverage_alone 0.00000000
## pct_white 0.00000000
## pct_black 0.00000000
## pct_asian 0.00000000
## pct_other_race 0.00000000
## pct_married_households 0.00000000
# Drop the two variables with a large share of missing values
# (pct_some_col18_24, pct_private_coverage_alone) together with two redundant
# ones: binned_inc, because median income is already in the data, and
# median_age, because the male and female median ages are used in the model
# instead.
cancer_reg <- cancer_reg %>%
  dplyr::select(
    -c(pct_some_col18_24, pct_private_coverage_alone, binned_inc, median_age)
  )
# About 5% of pct_employed16_over is missing. Before deciding whether to drop
# it, regress the outcome on it alone and keep the summary of that fit.
reg <- summary(
  lm(target_death_rate ~ pct_employed16_over, data = cancer_reg)
)
# Author's conclusion: the p-value is small, so pct_employed16_over is kept.
# Feature engineering for the modelling data set.
# - county and state are removed: the goal is a predictive model and the
#   area itself is not used as a predictor.
# - Raw counts can be misleading, so deaths and new cases are converted to
#   per-capita proportions (mortality, prevalence) and the counts are dropped.
#   NOTE(review): mortality is derived from avg_deaths_per_year; confirm it is
#   a legitimate predictor of target_death_rate rather than a restatement of
#   the outcome.
# - study_per_cap is recoded into a factor.
#   NOTE(review): the cut points are quantiles of the full column, zeros
#   included. The frequency table for this factor shows 63% "none", so the
#   25% and 50% quantiles are 0 and the "low" / "medium" levels never occur.
#   Behaviour is left unchanged here; consider taking the quantiles over the
#   non-zero values only.
# - The non-white race percentages are pooled into one variable because the
#   white percentage is much larger than each of the others.
cancer_reg <- cancer_reg %>%
  dplyr::select(-county, -state) %>%
  mutate(
    mortality = avg_deaths_per_year / pop_est2015,
    prevalence = avg_ann_count / pop_est2015
  ) %>%
  dplyr::select(-pop_est2015, -avg_ann_count, -avg_deaths_per_year) %>%
  mutate(
    study_per_cap = as.factor(case_when(
      study_per_cap == 0 ~ "none",
      study_per_cap < quantile(study_per_cap, .25) ~ "low",
      study_per_cap < quantile(study_per_cap, .5) ~ "medium",
      study_per_cap < quantile(study_per_cap, .75) ~ "high",
      TRUE ~ "very high"
    ))
  ) %>%
  mutate(pct_non_white = pct_black + pct_asian + pct_other_race) %>%
  dplyr::select(-pct_black, -pct_asian, -pct_other_race)
# Histogram of the outcome and of every continuous predictor, drawn one at a
# time to eyeball each distribution. The trailing remarks are the author's
# reading of the corresponding plot.
hist(cancer_reg$target_death_rate) # outcome looks normally distributed
hist(cancer_reg$pct_private_coverage)
hist(cancer_reg$pct_public_coverage)
hist(cancer_reg$pct_emp_priv_coverage)
hist(cancer_reg$pct_public_coverage_alone)
hist(cancer_reg$incidence_rate) # right skewed
hist(cancer_reg$med_income) # somewhat right skewed, mostly ok
hist(cancer_reg$poverty_percent)
hist(cancer_reg$median_age_male)
hist(cancer_reg$median_age_female)
hist(cancer_reg$avg_household_size) # left skewed
hist(cancer_reg$percent_married)
hist(cancer_reg$pct_no_hs18_24) # somewhat right skewed
hist(cancer_reg$pct_hs18_24)
hist(cancer_reg$pct_bach_deg18_24)# right skewed
hist(cancer_reg$pct_hs25_over)
hist(cancer_reg$pct_bach_deg25_over)
hist(cancer_reg$pct_employed16_over)
hist(cancer_reg$pct_unemployed16_over)
hist(cancer_reg$pct_white) # left skewed
hist(cancer_reg$pct_non_white) # right skewed
hist(cancer_reg$pct_married_households)
hist(cancer_reg$birth_rate)
hist(cancer_reg$mortality)
hist(cancer_reg$prevalence) # right skewed
# Apart from the skewed variables flagged above, the author judges the
# distributions to be approximately normal.
# Descriptive statistics for the continuous variables: every column except
# the factor study_per_cap, with the outcome listed first.
cont_var <- cancer_reg %>%
  dplyr::select(target_death_rate, everything(), -study_per_cap)
knitr::kable(
  summary(cont_var),
  caption = "descriptive statistics for continuous variables"
)
| target_death_rate | incidence_rate | med_income | poverty_percent | median_age_male | median_age_female | avg_household_size | percent_married | pct_no_hs18_24 | pct_hs18_24 | pct_bach_deg18_24 | pct_hs25_over | pct_bach_deg25_over | pct_employed16_over | pct_unemployed16_over | pct_private_coverage | pct_emp_priv_coverage | pct_public_coverage | pct_public_coverage_alone | pct_white | pct_married_households | birth_rate | mortality | prevalence | pct_non_white | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 59.7 | Min. : 201.3 | Min. : 22640 | Min. : 3.20 | Min. :22.40 | Min. :22.30 | Min. :0.0221 | Min. :23.10 | Min. : 0.00 | Min. : 0.0 | Min. : 0.000 | Min. : 7.50 | Min. : 2.50 | Min. :17.60 | Min. : 0.400 | Min. :22.30 | Min. :13.5 | Min. :11.20 | Min. : 2.60 | Min. : 10.20 | Min. :22.99 | Min. : 0.000 | Min. :0.000485 | Min. :0.0009281 | Min. : 0.000 | |
| 1st Qu.:161.2 | 1st Qu.: 420.3 | 1st Qu.: 38882 | 1st Qu.:12.15 | 1st Qu.:36.35 | 1st Qu.:39.10 | 1st Qu.:2.3700 | 1st Qu.:47.75 | 1st Qu.:12.80 | 1st Qu.:29.2 | 1st Qu.: 3.100 | 1st Qu.:30.40 | 1st Qu.: 9.40 | 1st Qu.:48.60 | 1st Qu.: 5.500 | 1st Qu.:57.20 | 1st Qu.:34.5 | 1st Qu.:30.90 | 1st Qu.:14.85 | 1st Qu.: 77.30 | 1st Qu.:47.76 | 1st Qu.: 4.521 | 1st Qu.:0.001888 | 1st Qu.:0.0048022 | 1st Qu.: 1.964 | |
| Median :178.1 | Median : 453.5 | Median : 45207 | Median :15.90 | Median :39.60 | Median :42.40 | Median :2.5000 | Median :52.40 | Median :17.10 | Median :34.7 | Median : 5.400 | Median :35.30 | Median :12.30 | Median :54.50 | Median : 7.600 | Median :65.10 | Median :41.1 | Median :36.30 | Median :18.80 | Median : 90.06 | Median :51.67 | Median : 5.381 | Median :0.002290 | Median :0.0056236 | Median : 5.569 | |
| Mean :178.7 | Mean : 448.3 | Mean : 47063 | Mean :16.88 | Mean :39.57 | Mean :42.15 | Mean :2.4797 | Mean :51.77 | Mean :18.22 | Mean :35.0 | Mean : 6.158 | Mean :34.80 | Mean :13.28 | Mean :54.15 | Mean : 7.852 | Mean :64.35 | Mean :41.2 | Mean :36.25 | Mean :19.24 | Mean : 83.65 | Mean :51.24 | Mean : 5.640 | Mean :0.002287 | Mean :0.0232443 | Mean :12.345 | |
| 3rd Qu.:195.2 | 3rd Qu.: 480.9 | 3rd Qu.: 52492 | 3rd Qu.:20.40 | 3rd Qu.:42.50 | 3rd Qu.:45.30 | 3rd Qu.:2.6300 | 3rd Qu.:56.40 | 3rd Qu.:22.70 | 3rd Qu.:40.7 | 3rd Qu.: 8.200 | 3rd Qu.:39.65 | 3rd Qu.:16.10 | 3rd Qu.:60.30 | 3rd Qu.: 9.700 | 3rd Qu.:72.10 | 3rd Qu.:47.7 | 3rd Qu.:41.55 | 3rd Qu.:23.10 | 3rd Qu.: 95.45 | 3rd Qu.:55.40 | 3rd Qu.: 6.494 | 3rd Qu.:0.002681 | 3rd Qu.:0.0064874 | 3rd Qu.:16.974 | |
| Max. :362.8 | Max. :1206.9 | Max. :125635 | Max. :47.40 | Max. :64.70 | Max. :65.70 | Max. :3.9700 | Max. :72.50 | Max. :64.10 | Max. :72.5 | Max. :51.800 | Max. :54.80 | Max. :42.20 | Max. :80.10 | Max. :29.400 | Max. :92.30 | Max. :70.7 | Max. :65.10 | Max. :46.60 | Max. :100.00 | Max. :78.08 | Max. :21.326 | Max. :0.005136 | Max. :2.3675123 | Max. :86.066 | |
| NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :152 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
# Frequency table for the categorical predictor study_per_cap: number of
# rows in each level and the share of all rows that level represents.
cancer_reg %>%
  count(study_per_cap) %>%
  mutate(prop = n / sum(n)) %>%
  knitr::kable(
    digits = 2,
    caption = "Descriptive Statistics for clinical trial"
  )
| study_per_cap | n | prop |
|---|---|---|
| high | 354 | 0.12 |
| none | 1931 | 0.63 |
| very high | 762 | 0.25 |
(not considering interaction)
# Full main-effects model: regress the outcome on every remaining column of
# cancer_reg (no interaction terms) and print the coefficient table.
multi.fit <- lm(formula = target_death_rate ~ ., data = cancer_reg)
summary(multi.fit)
##
## Call:
## lm(formula = target_death_rate ~ ., data = cancer_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.182 -7.532 0.058 7.141 84.257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.032e+02 1.058e+01 19.206 < 2e-16 ***
## incidence_rate 8.692e-02 5.297e-03 16.410 < 2e-16 ***
## med_income 2.619e-04 5.343e-05 4.902 1.00e-06 ***
## poverty_percent -3.129e-03 1.075e-01 -0.029 0.976775
## study_per_capnone -9.543e-01 8.107e-01 -1.177 0.239270
## study_per_capvery high -1.389e+00 8.532e-01 -1.628 0.103542
## median_age_male -6.012e-01 1.410e-01 -4.263 2.08e-05 ***
## median_age_female -2.264e+00 1.499e-01 -15.100 < 2e-16 ***
## avg_household_size -1.197e-02 6.501e-01 -0.018 0.985307
## percent_married 1.174e-02 1.177e-01 0.100 0.920606
## pct_no_hs18_24 -1.113e-03 3.771e-02 -0.030 0.976454
## pct_hs18_24 2.967e-01 3.324e-02 8.927 < 2e-16 ***
## pct_bach_deg18_24 3.625e-02 7.233e-02 0.501 0.616350
## pct_hs25_over 8.539e-02 6.429e-02 1.328 0.184219
## pct_bach_deg25_over -3.547e-01 1.052e-01 -3.373 0.000753 ***
## pct_employed16_over -6.641e-01 7.320e-02 -9.072 < 2e-16 ***
## pct_unemployed16_over 7.958e-01 1.121e-01 7.099 1.58e-12 ***
## pct_private_coverage -2.716e-01 8.736e-02 -3.109 0.001896 **
## pct_emp_priv_coverage 4.285e-01 7.042e-02 6.085 1.32e-09 ***
## pct_public_coverage -1.964e+00 1.544e-01 -12.720 < 2e-16 ***
## pct_public_coverage_alone 1.802e+00 1.904e-01 9.464 < 2e-16 ***
## pct_white -1.341e-01 3.874e-02 -3.460 0.000548 ***
## pct_married_households 5.822e-02 1.134e-01 0.513 0.607691
## birth_rate -5.153e-01 1.292e-01 -3.989 6.81e-05 ***
## mortality 4.949e+04 7.916e+02 62.524 < 2e-16 ***
## prevalence -1.699e+01 2.422e+00 -7.013 2.89e-12 ***
## pct_non_white -4.812e-02 3.746e-02 -1.284 0.199120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.71 on 2868 degrees of freedom
## (152 observations deleted due to missingness)
## Multiple R-squared: 0.7877, Adjusted R-squared: 0.7857
## F-statistic: 409.2 on 26 and 2868 DF, p-value: < 2.2e-16
# Reduced data set: the outcome plus a hand-picked subset of predictors.
cancer_subset <- cancer_reg %>%
  select(
    target_death_rate,
    incidence_rate,
    med_income,
    median_age_male,
    median_age_female,
    pct_hs18_24,
    pct_bach_deg25_over,
    pct_employed16_over,
    pct_unemployed16_over,
    pct_public_coverage_alone,
    birth_rate,
    mortality,
    prevalence,
    pct_private_coverage,
    pct_emp_priv_coverage,
    pct_public_coverage
  )
# Simple linear regressions of the outcome on each of the four insurance
# coverage variables. Each reg* object stores the summary() of the fit, not
# the fit itself.
reg1 <- summary(
  lm(target_death_rate ~ pct_private_coverage, data = cancer_reg)
)
reg2 <- summary(
  lm(target_death_rate ~ pct_emp_priv_coverage, data = cancer_reg)
)
reg3 <- summary(
  lm(target_death_rate ~ pct_public_coverage, data = cancer_reg)
)
reg4 <- summary(
  lm(target_death_rate ~ pct_public_coverage_alone, data = cancer_reg)
)
# Author's choice: pct_public_coverage_alone, since it has the largest R^2.
# Scatter plot of the outcome against each coverage variable, with the
# simple-regression line overlaid in red.
# abline() is given the lm fit itself. Passing a summary.lm object makes it
# read the 2 x 4 coefficient table (8 values) and warn that only the first
# two are used.
plot(cancer_reg$pct_private_coverage, cancer_reg$target_death_rate)
abline(
  lm(target_death_rate ~ pct_private_coverage, data = cancer_reg),
  lwd = 2, col = 2
)
plot(cancer_reg$pct_emp_priv_coverage, cancer_reg$target_death_rate)
abline(
  lm(target_death_rate ~ pct_emp_priv_coverage, data = cancer_reg),
  lwd = 2, col = 2
)
plot(cancer_reg$pct_public_coverage, cancer_reg$target_death_rate)
abline(
  lm(target_death_rate ~ pct_public_coverage, data = cancer_reg),
  lwd = 2, col = 2
)
plot(cancer_reg$pct_public_coverage_alone, cancer_reg$target_death_rate)
abline(
  lm(target_death_rate ~ pct_public_coverage_alone, data = cancer_reg),
  lwd = 2, col = 2
)
# Fit the multiple linear regression on all predictors in cancer_reg
# (the full data set, not cancer_subset).
canc.fit <- lm(formula = target_death_rate ~ ., data = cancer_reg)
# Automatic model selection: backward elimination with step(), which prints
# the AIC of every candidate deletion at each step.
step.fit <- step(canc.fit, direction = "backward")
## Start: AIC=14748.59
## target_death_rate ~ incidence_rate + med_income + poverty_percent +
## study_per_cap + median_age_male + median_age_female + avg_household_size +
## percent_married + pct_no_hs18_24 + pct_hs18_24 + pct_bach_deg18_24 +
## pct_hs25_over + pct_bach_deg25_over + pct_employed16_over +
## pct_unemployed16_over + pct_private_coverage + pct_emp_priv_coverage +
## pct_public_coverage + pct_public_coverage_alone + pct_white +
## pct_married_households + birth_rate + mortality + prevalence +
## pct_non_white
##
## Df Sum of Sq RSS AIC
## - avg_household_size 1 0 463514 14747
## - poverty_percent 1 0 463515 14747
## - pct_no_hs18_24 1 0 463515 14747
## - percent_married 1 2 463516 14747
## - pct_bach_deg18_24 1 41 463555 14747
## - pct_married_households 1 43 463557 14747
## - study_per_cap 2 429 463944 14747
## - pct_non_white 1 267 463781 14748
## - pct_hs25_over 1 285 463799 14748
## <none> 463514 14749
## - pct_private_coverage 1 1562 465076 14756
## - pct_bach_deg25_over 1 1839 465353 14758
## - pct_white 1 1935 465449 14759
## - birth_rate 1 2571 466086 14763
## - median_age_male 1 2937 466451 14765
## - med_income 1 3884 467398 14771
## - pct_emp_priv_coverage 1 5983 469498 14784
## - prevalence 1 7949 471463 14796
## - pct_unemployed16_over 1 8145 471659 14797
## - pct_hs18_24 1 12880 476394 14826
## - pct_employed16_over 1 13302 476816 14828
## - pct_public_coverage_alone 1 14475 477990 14836
## - pct_public_coverage 1 26151 489666 14906
## - median_age_female 1 36852 500367 14968
## - incidence_rate 1 43521 507035 15006
## - mortality 1 631789 1095304 17236
##
## Step: AIC=14746.59
## target_death_rate ~ incidence_rate + med_income + poverty_percent +
## study_per_cap + median_age_male + median_age_female + percent_married +
## pct_no_hs18_24 + pct_hs18_24 + pct_bach_deg18_24 + pct_hs25_over +
## pct_bach_deg25_over + pct_employed16_over + pct_unemployed16_over +
## pct_private_coverage + pct_emp_priv_coverage + pct_public_coverage +
## pct_public_coverage_alone + pct_white + pct_married_households +
## birth_rate + mortality + prevalence + pct_non_white
##
## Df Sum of Sq RSS AIC
## - poverty_percent 1 0 463515 14745
## - pct_no_hs18_24 1 0 463515 14745
## - percent_married 1 2 463516 14745
## - pct_bach_deg18_24 1 41 463555 14745
## - pct_married_households 1 44 463559 14745
## - study_per_cap 2 429 463944 14745
## - pct_non_white 1 267 463781 14746
## - pct_hs25_over 1 285 463800 14746
## <none> 463514 14747
## - pct_private_coverage 1 1567 465082 14754
## - pct_bach_deg25_over 1 1840 465354 14756
## - pct_white 1 1945 465460 14757
## - birth_rate 1 2574 466088 14761
## - median_age_male 1 2937 466452 14763
## - med_income 1 3898 467412 14769
## - pct_emp_priv_coverage 1 5987 469501 14782
## - prevalence 1 7951 471465 14794
## - pct_unemployed16_over 1 8162 471676 14795
## - pct_hs18_24 1 12913 476428 14824
## - pct_employed16_over 1 13363 476877 14827
## - pct_public_coverage_alone 1 14476 477991 14834
## - pct_public_coverage 1 26152 489666 14904
## - median_age_female 1 37059 500574 14967
## - incidence_rate 1 43652 507167 15005
## - mortality 1 631789 1095304 17234
##
## Step: AIC=14744.59
## target_death_rate ~ incidence_rate + med_income + study_per_cap +
## median_age_male + median_age_female + percent_married + pct_no_hs18_24 +
## pct_hs18_24 + pct_bach_deg18_24 + pct_hs25_over + pct_bach_deg25_over +
## pct_employed16_over + pct_unemployed16_over + pct_private_coverage +
## pct_emp_priv_coverage + pct_public_coverage + pct_public_coverage_alone +
## pct_white + pct_married_households + birth_rate + mortality +
## prevalence + pct_non_white
##
## Df Sum of Sq RSS AIC
## - pct_no_hs18_24 1 0 463515 14743
## - percent_married 1 2 463516 14743
## - pct_bach_deg18_24 1 40 463555 14743
## - pct_married_households 1 45 463559 14743
## - study_per_cap 2 429 463944 14743
## - pct_non_white 1 270 463785 14744
## - pct_hs25_over 1 286 463801 14744
## <none> 463515 14745
## - pct_private_coverage 1 1597 465112 14753
## - pct_bach_deg25_over 1 1859 465374 14754
## - pct_white 1 1947 465462 14755
## - birth_rate 1 2579 466094 14759
## - median_age_male 1 2943 466458 14761
## - med_income 1 5024 468538 14774
## - pct_emp_priv_coverage 1 6003 469518 14780
## - prevalence 1 7951 471465 14792
## - pct_unemployed16_over 1 8163 471677 14793
## - pct_hs18_24 1 12962 476477 14822
## - pct_public_coverage_alone 1 14683 478197 14833
## - pct_employed16_over 1 15098 478613 14835
## - pct_public_coverage 1 26245 489759 14902
## - median_age_female 1 37452 500966 14968
## - incidence_rate 1 43655 507170 15003
## - mortality 1 633305 1096819 17236
##
## Step: AIC=14742.59
## target_death_rate ~ incidence_rate + med_income + study_per_cap +
## median_age_male + median_age_female + percent_married + pct_hs18_24 +
## pct_bach_deg18_24 + pct_hs25_over + pct_bach_deg25_over +
## pct_employed16_over + pct_unemployed16_over + pct_private_coverage +
## pct_emp_priv_coverage + pct_public_coverage + pct_public_coverage_alone +
## pct_white + pct_married_households + birth_rate + mortality +
## prevalence + pct_non_white
##
## Df Sum of Sq RSS AIC
## - percent_married 1 2 463516 14741
## - pct_bach_deg18_24 1 42 463557 14741
## - pct_married_households 1 45 463559 14741
## - study_per_cap 2 432 463947 14741
## - pct_non_white 1 270 463785 14742
## - pct_hs25_over 1 288 463802 14742
## <none> 463515 14743
## - pct_private_coverage 1 1692 465207 14751
## - pct_bach_deg25_over 1 1874 465388 14752
## - pct_white 1 1957 465472 14753
## - birth_rate 1 2588 466103 14757
## - median_age_male 1 2945 466459 14759
## - med_income 1 5144 468658 14772
## - pct_emp_priv_coverage 1 6004 469519 14778
## - prevalence 1 7969 471484 14790
## - pct_unemployed16_over 1 8228 471743 14792
## - pct_hs18_24 1 13894 477409 14826
## - pct_public_coverage_alone 1 14766 478281 14831
## - pct_employed16_over 1 15147 478662 14834
## - pct_public_coverage 1 26245 489760 14900
## - median_age_female 1 37967 501482 14968
## - incidence_rate 1 43723 507238 15002
## - mortality 1 637651 1101166 17246
##
## Step: AIC=14740.61
## target_death_rate ~ incidence_rate + med_income + study_per_cap +
## median_age_male + median_age_female + pct_hs18_24 + pct_bach_deg18_24 +
## pct_hs25_over + pct_bach_deg25_over + pct_employed16_over +
## pct_unemployed16_over + pct_private_coverage + pct_emp_priv_coverage +
## pct_public_coverage + pct_public_coverage_alone + pct_white +
## pct_married_households + birth_rate + mortality + prevalence +
## pct_non_white
##
## Df Sum of Sq RSS AIC
## - pct_bach_deg18_24 1 43 463560 14739
## - study_per_cap 2 431 463947 14739
## - pct_married_households 1 196 463713 14740
## - pct_non_white 1 271 463788 14740
## - pct_hs25_over 1 286 463803 14740
## <none> 463516 14741
## - pct_private_coverage 1 1690 465207 14749
## - pct_bach_deg25_over 1 1885 465401 14750
## - pct_white 1 1959 465475 14751
## - birth_rate 1 2621 466138 14755
## - median_age_male 1 3046 466562 14758
## - med_income 1 5203 468720 14771
## - pct_emp_priv_coverage 1 6124 469640 14777
## - prevalence 1 7972 471488 14788
## - pct_unemployed16_over 1 8235 471752 14790
## - pct_hs18_24 1 14029 477545 14825
## - pct_public_coverage_alone 1 14773 478289 14829
## - pct_employed16_over 1 21243 484760 14868
## - pct_public_coverage 1 26303 489820 14898
## - median_age_female 1 37974 501490 14967
## - incidence_rate 1 43768 507284 15000
## - mortality 1 661380 1124896 17305
##
## Step: AIC=14738.88
## target_death_rate ~ incidence_rate + med_income + study_per_cap +
## median_age_male + median_age_female + pct_hs18_24 + pct_hs25_over +
## pct_bach_deg25_over + pct_employed16_over + pct_unemployed16_over +
## pct_private_coverage + pct_emp_priv_coverage + pct_public_coverage +
## pct_public_coverage_alone + pct_white + pct_married_households +
## birth_rate + mortality + prevalence + pct_non_white
##
## Df Sum of Sq RSS AIC
## - study_per_cap 2 434 463994 14738
## - pct_married_households 1 169 463729 14738
## - pct_non_white 1 261 463821 14738
## - pct_hs25_over 1 304 463863 14739
## <none> 463560 14739
## - pct_private_coverage 1 1651 465210 14747
## - pct_bach_deg25_over 1 1846 465406 14748
## - pct_white 1 1930 465490 14749
## - birth_rate 1 2677 466237 14754
## - median_age_male 1 3010 466569 14756
## - med_income 1 5417 468977 14770
## - pct_emp_priv_coverage 1 6089 469649 14775
## - prevalence 1 7932 471491 14786
## - pct_unemployed16_over 1 8194 471754 14788
## - pct_hs18_24 1 14305 477864 14825
## - pct_public_coverage_alone 1 15143 478703 14830
## - pct_employed16_over 1 21206 484766 14866
## - pct_public_coverage 1 26596 490156 14898
## - median_age_female 1 37989 501549 14965
## - incidence_rate 1 43748 507308 14998
## - mortality 1 661336 1124896 17303
##
## Step: AIC=14737.59
## target_death_rate ~ incidence_rate + med_income + median_age_male +
## median_age_female + pct_hs18_24 + pct_hs25_over + pct_bach_deg25_over +
## pct_employed16_over + pct_unemployed16_over + pct_private_coverage +
## pct_emp_priv_coverage + pct_public_coverage + pct_public_coverage_alone +
## pct_white + pct_married_households + birth_rate + mortality +
## prevalence + pct_non_white
##
## Df Sum of Sq RSS AIC
## - pct_married_households 1 200 464194 14737
## - pct_non_white 1 225 464219 14737
## <none> 463994 14738
## - pct_hs25_over 1 331 464325 14738
## - pct_private_coverage 1 1657 465651 14746
## - pct_white 1 1867 465861 14747
## - pct_bach_deg25_over 1 1888 465882 14747
## - birth_rate 1 2763 466757 14753
## - median_age_male 1 2990 466984 14754
## - med_income 1 5612 469606 14770
## - pct_emp_priv_coverage 1 6006 470000 14773
## - prevalence 1 7965 471958 14785
## - pct_unemployed16_over 1 8349 472343 14787
## - pct_hs18_24 1 14221 478215 14823
## - pct_public_coverage_alone 1 15134 479128 14828
## - pct_employed16_over 1 21337 485331 14866
## - pct_public_coverage 1 26622 490616 14897
## - median_age_female 1 38248 502242 14965
## - incidence_rate 1 44187 508181 14999
## - mortality 1 661507 1125501 17301
##
## Step: AIC=14736.83
## target_death_rate ~ incidence_rate + med_income + median_age_male +
## median_age_female + pct_hs18_24 + pct_hs25_over + pct_bach_deg25_over +
## pct_employed16_over + pct_unemployed16_over + pct_private_coverage +
## pct_emp_priv_coverage + pct_public_coverage + pct_public_coverage_alone +
## pct_white + birth_rate + mortality + prevalence + pct_non_white
##
## Df Sum of Sq RSS AIC
## - pct_non_white 1 262 464456 14736
## <none> 464194 14737
## - pct_hs25_over 1 339 464533 14737
## - pct_private_coverage 1 1573 465767 14745
## - pct_white 1 1683 465877 14745
## - pct_bach_deg25_over 1 2435 466629 14750
## - birth_rate 1 2693 466887 14752
## - median_age_male 1 2882 467076 14753
## - pct_emp_priv_coverage 1 5806 470000 14771
## - med_income 1 7619 471813 14782
## - prevalence 1 7936 472130 14784
## - pct_unemployed16_over 1 8183 472377 14785
## - pct_hs18_24 1 14575 478769 14824
## - pct_public_coverage_alone 1 15425 479619 14830
## - pct_employed16_over 1 22509 486703 14872
## - pct_public_coverage 1 28133 492327 14905
## - median_age_female 1 38150 502344 14964
## - incidence_rate 1 44051 508245 14997
## - mortality 1 668614 1132808 17318
##
## Step: AIC=14736.47
## target_death_rate ~ incidence_rate + med_income + median_age_male +
## median_age_female + pct_hs18_24 + pct_hs25_over + pct_bach_deg25_over +
## pct_employed16_over + pct_unemployed16_over + pct_private_coverage +
## pct_emp_priv_coverage + pct_public_coverage + pct_public_coverage_alone +
## pct_white + birth_rate + mortality + prevalence
##
## Df Sum of Sq RSS AIC
## <none> 464456 14736
## - pct_hs25_over 1 427 464883 14737
## - pct_private_coverage 1 1717 466173 14745
## - pct_bach_deg25_over 1 2280 466736 14749
## - birth_rate 1 2564 467020 14750
## - pct_white 1 2736 467192 14752
## - median_age_male 1 2781 467238 14752
## - pct_emp_priv_coverage 1 5646 470102 14769
## - prevalence 1 7829 472285 14783
## - pct_unemployed16_over 1 8117 472573 14785
## - med_income 1 8262 472719 14786
## - pct_hs18_24 1 14840 479297 14826
## - pct_public_coverage_alone 1 15181 479637 14828
## - pct_employed16_over 1 22264 486720 14870
## - pct_public_coverage 1 27887 492343 14903
## - median_age_female 1 40509 504965 14977
## - incidence_rate 1 43795 508251 14995
## - mortality 1 669213 1133669 17318
# Exhaustive best-subset search: the single best model of each size, up to
# 26 variables. Keep the summary in `rs` and print it.
b <- regsubsets(target_death_rate ~ ., nvmax = 26, data = cancer_reg)
rs <- summary(b)
rs
## Subset selection object
## Call: regsubsets.formula(target_death_rate ~ ., nvmax = 26, data = cancer_reg)
## 26 Variables (and intercept)
## Forced in Forced out
## incidence_rate FALSE FALSE
## med_income FALSE FALSE
## poverty_percent FALSE FALSE
## study_per_capnone FALSE FALSE
## study_per_capvery high FALSE FALSE
## median_age_male FALSE FALSE
## median_age_female FALSE FALSE
## avg_household_size FALSE FALSE
## percent_married FALSE FALSE
## pct_no_hs18_24 FALSE FALSE
## pct_hs18_24 FALSE FALSE
## pct_bach_deg18_24 FALSE FALSE
## pct_hs25_over FALSE FALSE
## pct_bach_deg25_over FALSE FALSE
## pct_employed16_over FALSE FALSE
## pct_unemployed16_over FALSE FALSE
## pct_private_coverage FALSE FALSE
## pct_emp_priv_coverage FALSE FALSE
## pct_public_coverage FALSE FALSE
## pct_public_coverage_alone FALSE FALSE
## pct_white FALSE FALSE
## pct_married_households FALSE FALSE
## birth_rate FALSE FALSE
## mortality FALSE FALSE
## prevalence FALSE FALSE
## pct_non_white FALSE FALSE
## 1 subsets of each size up to 26
## Selection Algorithm: exhaustive
## incidence_rate med_income poverty_percent study_per_capnone
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) "*" " " " " " "
## 5 ( 1 ) "*" " " " " " "
## 6 ( 1 ) "*" " " " " " "
## 7 ( 1 ) "*" " " " " " "
## 8 ( 1 ) "*" " " " " " "
## 9 ( 1 ) "*" " " " " " "
## 10 ( 1 ) "*" " " " " " "
## 11 ( 1 ) "*" " " " " " "
## 12 ( 1 ) "*" "*" " " " "
## 13 ( 1 ) "*" "*" " " " "
## 14 ( 1 ) "*" "*" " " " "
## 15 ( 1 ) "*" "*" " " " "
## 16 ( 1 ) "*" "*" " " " "
## 17 ( 1 ) "*" "*" " " " "
## 18 ( 1 ) "*" "*" " " " "
## 19 ( 1 ) "*" "*" " " " "
## 20 ( 1 ) "*" "*" " " "*"
## 21 ( 1 ) "*" "*" " " "*"
## 22 ( 1 ) "*" "*" " " "*"
## 23 ( 1 ) "*" "*" " " "*"
## 24 ( 1 ) "*" "*" " " "*"
## 25 ( 1 ) "*" "*" "*" "*"
## 26 ( 1 ) "*" "*" "*" "*"
## study_per_capvery high median_age_male median_age_female
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " "*"
## 3 ( 1 ) " " " " "*"
## 4 ( 1 ) " " " " "*"
## 5 ( 1 ) " " " " "*"
## 6 ( 1 ) " " " " "*"
## 7 ( 1 ) " " " " "*"
## 8 ( 1 ) " " " " "*"
## 9 ( 1 ) " " " " "*"
## 10 ( 1 ) " " " " "*"
## 11 ( 1 ) " " " " "*"
## 12 ( 1 ) " " " " "*"
## 13 ( 1 ) " " " " "*"
## 14 ( 1 ) " " " " "*"
## 15 ( 1 ) " " "*" "*"
## 16 ( 1 ) " " "*" "*"
## 17 ( 1 ) " " "*" "*"
## 18 ( 1 ) " " "*" "*"
## 19 ( 1 ) "*" "*" "*"
## 20 ( 1 ) "*" "*" "*"
## 21 ( 1 ) "*" "*" "*"
## 22 ( 1 ) "*" "*" "*"
## 23 ( 1 ) "*" "*" "*"
## 24 ( 1 ) "*" "*" "*"
## 25 ( 1 ) "*" "*" "*"
## 26 ( 1 ) "*" "*" "*"
## avg_household_size percent_married pct_no_hs18_24 pct_hs18_24
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " "*"
## 6 ( 1 ) " " " " " " " "
## 7 ( 1 ) " " " " " " "*"
## 8 ( 1 ) " " " " " " "*"
## 9 ( 1 ) " " " " " " "*"
## 10 ( 1 ) " " " " " " "*"
## 11 ( 1 ) " " " " " " "*"
## 12 ( 1 ) " " " " " " "*"
## 13 ( 1 ) " " " " " " "*"
## 14 ( 1 ) " " " " " " "*"
## 15 ( 1 ) " " " " " " "*"
## 16 ( 1 ) " " " " " " "*"
## 17 ( 1 ) " " " " " " "*"
## 18 ( 1 ) " " " " " " "*"
## 19 ( 1 ) " " " " " " "*"
## 20 ( 1 ) " " " " " " "*"
## 21 ( 1 ) " " " " " " "*"
## 22 ( 1 ) " " " " " " "*"
## 23 ( 1 ) " " "*" " " "*"
## 24 ( 1 ) " " "*" "*" "*"
## 25 ( 1 ) " " "*" "*" "*"
## 26 ( 1 ) "*" "*" "*" "*"
## pct_bach_deg18_24 pct_hs25_over pct_bach_deg25_over
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) " " " " " "
## 5 ( 1 ) " " " " " "
## 6 ( 1 ) " " " " " "
## 7 ( 1 ) " " " " " "
## 8 ( 1 ) " " " " " "
## 9 ( 1 ) " " " " " "
## 10 ( 1 ) " " " " " "
## 11 ( 1 ) " " " " " "
## 12 ( 1 ) " " " " "*"
## 13 ( 1 ) " " " " "*"
## 14 ( 1 ) " " " " "*"
## 15 ( 1 ) " " " " "*"
## 16 ( 1 ) " " " " "*"
## 17 ( 1 ) " " "*" "*"
## 18 ( 1 ) " " "*" "*"
## 19 ( 1 ) " " "*" "*"
## 20 ( 1 ) " " "*" "*"
## 21 ( 1 ) " " "*" "*"
## 22 ( 1 ) "*" "*" "*"
## 23 ( 1 ) "*" "*" "*"
## 24 ( 1 ) "*" "*" "*"
## 25 ( 1 ) "*" "*" "*"
## 26 ( 1 ) "*" "*" "*"
## pct_employed16_over pct_unemployed16_over pct_private_coverage
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) " " "*" " "
## 5 ( 1 ) " " "*" " "
## 6 ( 1 ) "*" " " " "
## 7 ( 1 ) "*" " " " "
## 8 ( 1 ) "*" " " " "
## 9 ( 1 ) "*" "*" " "
## 10 ( 1 ) "*" "*" " "
## 11 ( 1 ) "*" "*" " "
## 12 ( 1 ) "*" "*" " "
## 13 ( 1 ) "*" "*" " "
## 14 ( 1 ) "*" "*" " "
## 15 ( 1 ) "*" "*" " "
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
## 18 ( 1 ) "*" "*" "*"
## 19 ( 1 ) "*" "*" "*"
## 20 ( 1 ) "*" "*" "*"
## 21 ( 1 ) "*" "*" "*"
## 22 ( 1 ) "*" "*" "*"
## 23 ( 1 ) "*" "*" "*"
## 24 ( 1 ) "*" "*" "*"
## 25 ( 1 ) "*" "*" "*"
## 26 ( 1 ) "*" "*" "*"
## pct_emp_priv_coverage pct_public_coverage
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " "*"
## 7 ( 1 ) " " "*"
## 8 ( 1 ) "*" "*"
## 9 ( 1 ) "*" "*"
## 10 ( 1 ) "*" "*"
## 11 ( 1 ) "*" "*"
## 12 ( 1 ) "*" "*"
## 13 ( 1 ) "*" "*"
## 14 ( 1 ) "*" "*"
## 15 ( 1 ) "*" "*"
## 16 ( 1 ) "*" "*"
## 17 ( 1 ) "*" "*"
## 18 ( 1 ) "*" "*"
## 19 ( 1 ) "*" "*"
## 20 ( 1 ) "*" "*"
## 21 ( 1 ) "*" "*"
## 22 ( 1 ) "*" "*"
## 23 ( 1 ) "*" "*"
## 24 ( 1 ) "*" "*"
## 25 ( 1 ) "*" "*"
## 26 ( 1 ) "*" "*"
## pct_public_coverage_alone pct_white pct_married_households
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) " " " " " "
## 5 ( 1 ) " " " " " "
## 6 ( 1 ) "*" " " " "
## 7 ( 1 ) "*" " " " "
## 8 ( 1 ) "*" " " " "
## 9 ( 1 ) "*" " " " "
## 10 ( 1 ) "*" " " " "
## 11 ( 1 ) "*" "*" " "
## 12 ( 1 ) "*" " " " "
## 13 ( 1 ) "*" "*" " "
## 14 ( 1 ) "*" "*" " "
## 15 ( 1 ) "*" "*" " "
## 16 ( 1 ) "*" "*" " "
## 17 ( 1 ) "*" "*" " "
## 18 ( 1 ) "*" "*" " "
## 19 ( 1 ) "*" "*" " "
## 20 ( 1 ) "*" "*" " "
## 21 ( 1 ) "*" "*" "*"
## 22 ( 1 ) "*" "*" "*"
## 23 ( 1 ) "*" "*" "*"
## 24 ( 1 ) "*" "*" "*"
## 25 ( 1 ) "*" "*" "*"
## 26 ( 1 ) "*" "*" "*"
## birth_rate mortality prevalence pct_non_white
## 1 ( 1 ) " " "*" " " " "
## 2 ( 1 ) " " "*" " " " "
## 3 ( 1 ) " " "*" " " " "
## 4 ( 1 ) " " "*" " " " "
## 5 ( 1 ) " " "*" " " " "
## 6 ( 1 ) " " "*" " " " "
## 7 ( 1 ) " " "*" " " " "
## 8 ( 1 ) " " "*" " " " "
## 9 ( 1 ) " " "*" " " " "
## 10 ( 1 ) " " "*" "*" " "
## 11 ( 1 ) " " "*" "*" " "
## 12 ( 1 ) " " "*" "*" " "
## 13 ( 1 ) " " "*" "*" " "
## 14 ( 1 ) "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" " "
## 18 ( 1 ) "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*"
## 21 ( 1 ) "*" "*" "*" "*"
## 22 ( 1 ) "*" "*" "*" "*"
## 23 ( 1 ) "*" "*" "*" "*"
## 24 ( 1 ) "*" "*" "*" "*"
## 25 ( 1 ) "*" "*" "*" "*"
## 26 ( 1 ) "*" "*" "*" "*"
# Print the Cp values stored in the subset-selection summary `rs`.
rs[["cp"]]
## [1] 6403.39877 2125.77633 1555.04810 1121.77180 829.14513 513.27874
## [7] 355.12225 261.02523 174.84358 122.66120 101.61517 73.62995
## [13] 49.11651 35.36383 23.09511 15.47010 14.82888 15.20583
## [19] 15.70549 16.32869 17.28050 19.01271 21.00207 23.00120
## [25] 25.00034 27.00000
# Print the adjusted R-squared values stored in the subset-selection summary `rs`.
rs[["adjr2"]]
## [1] 0.3116679 0.6284832 0.6707996 0.7029552 0.7247023 0.7481886 0.7599870
## [8] 0.7670379 0.7735059 0.7774526 0.7790881 0.7812405 0.7831363 0.7842329
## [15] 0.7852199 0.7858618 0.7859840 0.7860305 0.7860679 0.7860961 0.7860998
## [22] 0.7860453 0.7859716 0.7858971 0.7858225 0.7857479
# Side-by-side diagnostic plots of Cp and adjusted R2 against model size.
# NOTE(review): assumes entry i of rs$cp / rs$adjr2 belongs to the best model
# with i predictors (one model per size), as the printed output suggests.
# A model with i predictors has i + 1 parameters once the intercept is
# counted, and Cp is judged against the parameter count. The x-axis therefore
# has to be i + 1; with 1:26 the Cp = p reference line was shifted by one
# (the full model has Cp = 27 but was drawn at x = 26).
n_params <- seq_along(rs$cp) + 1
old_par <- par(mfrow = c(1, 2))
plot(n_params, rs$cp, xlab = "No of parameters", ylab = "Cp Statistic")
abline(0, 1) # reference line Cp = p
plot(n_params, rs$adjr2, xlab = "No of parameters", ylab = "Adj R2")
par(old_par) # put the previous plotting layout back
# Tabulate subset-selection results for a fitted model.
#
# Runs regsubsets() on the formula and model frame of `model`, then binds the
# selection matrix from its summary together with the fit criteria.
#
# Arguments:
#   model  A fitted model object for which formula() and model.frame() work.
#   ...    Further arguments forwarded unchanged to regsubsets() (e.g. nbest).
#   nvmax  Largest subset size to examine, forwarded to regsubsets(). Defaults
#          to 26, the value that was previously hard-coded; it must be given
#          by its full name because it follows `...`.
#
# Returns a numeric matrix with one row per reported subset: column `p` (the
# size label taken from the row names of the selection matrix), one column
# per model term marking inclusion, then rss, rsq, adjr2, cp and bic.
best <- function(model, ..., nvmax = 26) {
  fit <- regsubsets(
    formula(model),
    data = model.frame(model),
    nvmax = nvmax,
    ...
  )
  smry <- summary(fit)
  # Name the criteria columns explicitly: cbind() only derives column names
  # automatically from bare symbols, not from `smry$...` expressions.
  cbind(
    p = as.numeric(rownames(smry$which)),
    smry$which,
    rss = smry$rss,
    rsq = smry$rsq,
    adjr2 = smry$adjr2,
    cp = smry$cp,
    bic = smry$bic
  )
}
# Show the best subset of each size for canc.fit, rounded to six decimals.
round(
  best(canc.fit, nbest = 1),
  digits = 6
)
## p (Intercept) incidence_rate med_income poverty_percent
## 1 1 1 0 0 0
## 2 2 1 0 0 0
## 3 3 1 0 0 0
## 4 4 1 1 0 0
## 5 5 1 1 0 0
## 6 6 1 1 0 0
## 7 7 1 1 0 0
## 8 8 1 1 0 0
## 9 9 1 1 0 0
## 10 10 1 1 0 0
## 11 11 1 1 0 0
## 12 12 1 1 1 0
## 13 13 1 1 1 0
## 14 14 1 1 1 0
## 15 15 1 1 1 0
## 16 16 1 1 1 0
## 17 17 1 1 1 0
## 18 18 1 1 1 0
## 19 19 1 1 1 0
## 20 20 1 1 1 0
## 21 21 1 1 1 0
## 22 22 1 1 1 0
## 23 23 1 1 1 0
## 24 24 1 1 1 0
## 25 25 1 1 1 1
## 26 26 1 1 1 1
## study_per_capnone study_per_capvery high median_age_male
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## 7 0 0 0
## 8 0 0 0
## 9 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 0 0
## 13 0 0 0
## 14 0 0 0
## 15 0 0 1
## 16 0 0 1
## 17 0 0 1
## 18 0 0 1
## 19 0 1 1
## 20 1 1 1
## 21 1 1 1
## 22 1 1 1
## 23 1 1 1
## 24 1 1 1
## 25 1 1 1
## 26 1 1 1
## median_age_female avg_household_size percent_married pct_no_hs18_24
## 1 0 0 0 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 1 0 0 0
## 6 1 0 0 0
## 7 1 0 0 0
## 8 1 0 0 0
## 9 1 0 0 0
## 10 1 0 0 0
## 11 1 0 0 0
## 12 1 0 0 0
## 13 1 0 0 0
## 14 1 0 0 0
## 15 1 0 0 0
## 16 1 0 0 0
## 17 1 0 0 0
## 18 1 0 0 0
## 19 1 0 0 0
## 20 1 0 0 0
## 21 1 0 0 0
## 22 1 0 0 0
## 23 1 0 1 0
## 24 1 0 1 1
## 25 1 0 1 1
## 26 1 1 1 1
## pct_hs18_24 pct_bach_deg18_24 pct_hs25_over pct_bach_deg25_over
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 1 0 0 0
## 6 0 0 0 0
## 7 1 0 0 0
## 8 1 0 0 0
## 9 1 0 0 0
## 10 1 0 0 0
## 11 1 0 0 0
## 12 1 0 0 1
## 13 1 0 0 1
## 14 1 0 0 1
## 15 1 0 0 1
## 16 1 0 0 1
## 17 1 0 1 1
## 18 1 0 1 1
## 19 1 0 1 1
## 20 1 0 1 1
## 21 1 0 1 1
## 22 1 1 1 1
## 23 1 1 1 1
## 24 1 1 1 1
## 25 1 1 1 1
## 26 1 1 1 1
## pct_employed16_over pct_unemployed16_over pct_private_coverage
## 1 0 0 0
## 2 0 0 0
## 3 0 1 0
## 4 0 1 0
## 5 0 1 0
## 6 1 0 0
## 7 1 0 0
## 8 1 0 0
## 9 1 1 0
## 10 1 1 0
## 11 1 1 0
## 12 1 1 0
## 13 1 1 0
## 14 1 1 0
## 15 1 1 0
## 16 1 1 1
## 17 1 1 1
## 18 1 1 1
## 19 1 1 1
## 20 1 1 1
## 21 1 1 1
## 22 1 1 1
## 23 1 1 1
## 24 1 1 1
## 25 1 1 1
## 26 1 1 1
## pct_emp_priv_coverage pct_public_coverage pct_public_coverage_alone
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 1 1
## 7 0 1 1
## 8 1 1 1
## 9 1 1 1
## 10 1 1 1
## 11 1 1 1
## 12 1 1 1
## 13 1 1 1
## 14 1 1 1
## 15 1 1 1
## 16 1 1 1
## 17 1 1 1
## 18 1 1 1
## 19 1 1 1
## 20 1 1 1
## 21 1 1 1
## 22 1 1 1
## 23 1 1 1
## 24 1 1 1
## 25 1 1 1
## 26 1 1 1
## pct_white pct_married_households birth_rate mortality prevalence
## 1 0 0 0 1 0
## 2 0 0 0 1 0
## 3 0 0 0 1 0
## 4 0 0 0 1 0
## 5 0 0 0 1 0
## 6 0 0 0 1 0
## 7 0 0 0 1 0
## 8 0 0 0 1 0
## 9 0 0 0 1 0
## 10 0 0 0 1 1
## 11 1 0 0 1 1
## 12 0 0 0 1 1
## 13 1 0 0 1 1
## 14 1 0 1 1 1
## 15 1 0 1 1 1
## 16 1 0 1 1 1
## 17 1 0 1 1 1
## 18 1 0 1 1 1
## 19 1 0 1 1 1
## 20 1 0 1 1 1
## 21 1 1 1 1 1
## 22 1 1 1 1 1
## 23 1 1 1 1 1
## 24 1 1 1 1 1
## 25 1 1 1 1 1
## 26 1 1 1 1 1
## pct_non_white rss rsq adjr2 cp bic
## 1 0 1502122.6 0.311906 0.311668 6403.39877 -1066.295
## 2 0 810467.6 0.628740 0.628483 2125.77633 -2844.606
## 3 0 717905.6 0.671141 0.670800 1555.04810 -3187.721
## 4 0 647558.0 0.703366 0.702955 1121.77180 -3478.311
## 5 0 599941.7 0.725178 0.724702 829.14513 -3691.449
## 6 0 548569.4 0.748711 0.748189 513.27874 -3942.634
## 7 0 522685.6 0.760568 0.759987 355.12225 -4074.590
## 8 0 507154.8 0.767682 0.767038 261.02523 -4153.943
## 9 0 492903.2 0.774210 0.773506 174.84358 -4228.490
## 10 0 484146.5 0.778222 0.777453 122.66120 -4272.413
## 11 0 480421.9 0.779928 0.779088 101.61517 -4286.800
## 12 0 475575.8 0.782148 0.781241 73.62995 -4308.180
## 13 0 471290.8 0.784111 0.783136 49.11651 -4326.412
## 14 0 468744.9 0.785277 0.784233 35.36383 -4334.122
## 15 0 466438.8 0.786333 0.785220 23.09511 -4340.429
## 16 0 464883.3 0.787046 0.785862 15.47010 -4342.129
## 17 0 464456.4 0.787241 0.785984 14.82888 -4336.818
## 18 1 464194.1 0.787361 0.786031 15.20583 -4330.482
## 19 1 463951.6 0.787472 0.786068 15.70549 -4324.024
## 20 1 463729.1 0.787574 0.786096 16.32869 -4317.442
## 21 1 463559.7 0.787652 0.786100 17.28050 -4310.529
## 22 1 463516.4 0.787672 0.786045 19.01271 -4302.829
## 23 1 463514.7 0.787673 0.785972 21.00207 -4294.869
## 24 1 463514.6 0.787673 0.785897 23.00120 -4286.899
## 25 1 463514.4 0.787673 0.785823 25.00034 -4278.929
## 26 1 463514.4 0.787673 0.785748 27.00000 -4270.959
The 17-predictor model minimizes Cp, while the 21-predictor model maximizes adjusted R2.
# Linear model on the 17 predictors of the subset with the lowest Cp.
Cp.fit <- lm(
  formula = target_death_rate ~
    incidence_rate +
    med_income +
    median_age_male +
    median_age_female +
    pct_hs18_24 +
    pct_hs25_over +
    pct_bach_deg25_over +
    pct_employed16_over +
    pct_unemployed16_over +
    pct_private_coverage +
    pct_emp_priv_coverage +
    pct_public_coverage +
    pct_public_coverage_alone +
    pct_white +
    birth_rate +
    mortality +
    prevalence,
  data = cancer_reg
)
# Linear model on the terms of the subset with the highest adjusted R2
# (size 21 in the table, where study_per_cap contributes two dummy columns).
adjR2.fit <- lm(
  formula = target_death_rate ~
    incidence_rate +
    med_income +
    study_per_cap +
    median_age_male +
    median_age_female +
    pct_hs18_24 +
    pct_hs25_over +
    pct_bach_deg25_over +
    pct_employed16_over +
    pct_unemployed16_over +
    pct_private_coverage +
    pct_emp_priv_coverage +
    pct_public_coverage +
    pct_public_coverage_alone +
    pct_white +
    pct_married_households +
    birth_rate +
    mortality +
    prevalence +
    pct_non_white,
  data = cancer_reg
)